rm(list = ls(all.names = TRUE)) 
gc() 

data_robust <- 
  read.csv(file = "intermediary_outputs/robustness/robust_base_with_connection_data.csv")

descrip_robust <- function(var_list) {
  counter = 0
  groups_list <- list("flag_interview", "flag_grades", "flag_one", "flag_two", "flag", "flag_class_70_10", 
                      "flag_class_80_10", "flag_class_50_10", "flag_class_70_2", "flag_class_80_2", "flag_final")
  
  observations <- lapply(groups_list, function(x) table(data_robust[x]))
  
  inviduals_1 <- lapply(groups_list, function(x) which(data_robust[,x] == 1 ))
  inviduals_0 <- lapply(groups_list, function(x) which(data_robust[,x] == 0 ))
  
  compute_values <- function(individuals, variable) {
    #individuals = inviduals_0[[2]]
    #variable = "grades_score"
    avg <- mean(data_robust[individuals, variable], na.rm = T)
    sd <-  sd(data_robust[individuals, variable], na.rm = T)
    missing <- mean(is.na(data_robust[individuals, variable]))
    
    return(c(avg, sd, missing))
    
  }
  
  for (var_of_int in var_list) {
    # var_of_int = c("grades_score")
    
    counter = counter + 1
    
    results_0 <- lapply(inviduals_0, compute_values, variable = var_of_int)
    results_1 <- lapply(inviduals_1, compute_values, variable = var_of_int)
    
    table_ind <- matrix(data = c(results_0[[1]], results_1[[1]], 
                                 results_0[[2]], results_1[[2]],
                                 results_0[[5]], results_1[[5]]), ncol = 6)
    
    table_ind <- matrix(data = c(results_0[[1]], results_1[[1]]), ncol = 2)
    table_ind <- matrix(data = c(results_0[[1]], results_1[[1]]), ncol = 2)
    
    results_var <- mapply(function(x0,x1) matrix(data = c(x0,x1), ncol =2 ), results_0, results_1, SIMPLIFY = F)
    
    table_ind <- round(cbind(results_var[[1]], results_var[[2]], results_var[[4]],results_var[[5]]),2)
    rownames(table_ind) <- c(var_of_int, "SD", "Share Missing")
    
    table_class <- round(cbind(results_var[[6]], results_var[[7]], results_var[[8]], results_var[[11]]),2)
    rownames(table_class) <- c(var_of_int, "SD", "Share Missing")
    
    if (counter == 1) {
      # Prepare individual criteria
      table_final_ind <- matrix(data = c("No", "Yes", "No", "Yes", "No", "Yes", "No", "Yes", 
                                         t(observations[[1]]), t(observations[[2]]), t(observations[[4]]), t(observations[[5]])),
                                nrow = 2, byrow = T)
      
      rownames(table_final_ind) <- c("", "Observations")
      colnames(table_final_ind) <- c("Missing Interview", "", "Missing Grades","", "Abandoned School", "", "Not valid", "")
      
      table_final_ind <- rbind(table_final_ind, table_ind)
      
      # Prepara class criterium
      table_final_class <- matrix(data = c("No", "Yes", "No", "Yes", "No", "Yes", "No", "Yes", 
                                           t(observations[[6]]), t(observations[[7]]), t(observations[[8]]), t(observations[[11]])),
                                  nrow = 2, byrow = T)
      
      rownames(table_final_class) <- c("", "Observations")
      colnames(table_final_class) <- c("70\\% Valid", "", "80\\% Valid", "", "50\\% Valid", "", "Final Sample", "")
      
      table_final_class <- rbind(table_final_class, table_class)
      
    } else {
      
      table_final_ind <- rbind(table_final_ind, table_ind)
      table_final_class <- rbind(table_final_class, table_class)
    }
    
  }
  
  return(list(table_final_ind, table_final_class))
  
  
}


variables_to_teste_class <- c("grades_score", "ssi_same_negro_na_2", 
                              "ssi_other_race_negro_na_2", "negro_na", 
                              "skin_color", "age", "age_grade_distortion", 
                              "sh_nw_class", "scores_poverty",
                              "score_neighborhood_quality")

### Table A3 - Descriptive Statistics for eliminated classes
samples_ind <- descrip_robust(variables_to_teste_class)[[1]]
xtable(samples_ind)

samples_class <- descrip_robust(variables_to_teste_class)[[2]]
xtable(samples_class)


#### Regressions

## Define groups of covariates for regressions: --------
ind_controls <- paste("male", "factor(religion_simple)", 
                      "age_grade_distortion", "score_self_esteem", 
                      "score_parents_support", "score_study", sep = "+")

ses_controls <- paste("scores_poverty", "score_neighborhood_quality", 
                      "factor(occ_father_simple_no_job_no_na)",
                      "factor(occ_mother_simple_no_job_no_na)", sep = "+")

class_fe <- paste("factor(class_id)")

source("09a_supp_function_regressions_en.R", encoding = "UTF-8")

regs_cond_1 <- function(dep_var, race_var, grade_var, condition) {
  # dep_var = "ssi_same_negro_na_2_std"
  # race_var = "factor(negro_na)"
  # grade_var = "grades_score"
  
  # basic regression
  basic_reg <- paste0(dep_var, "~", paste0(race_var,"*", grade_var))
  
  Form_basic <- formula(basic_reg)
  
  # Individual control
  Form_ind <- formula(paste(basic_reg, ind_controls, sep = "+"))
  
  # SES control
  Form_ses <- formula(paste(basic_reg, ses_controls, sep = "+"))
  
  # Individual + SES control
  Form_ind_ses <- formula(paste(basic_reg, ind_controls, 
                                ses_controls, sep = "+"))
  
  # Individual controls + Classroom FE
  Form_ind_class_fe <- formula(paste(basic_reg, ind_controls, 
                                     class_fe, sep = "+"))
  
  # SES controls + Classroom FE
  Form_ses_class_fe <- formula(paste(basic_reg, ses_controls, 
                                     class_fe, sep = "+"))
  
  # Ind. + SES controls + Classroom FE
  Form_ind_ses_class_fe <- formula(paste(basic_reg, ind_controls,
                                         ses_controls, class_fe, 
                                         sep = "+"))
  
  list_specifications <- c(Form_basic, Form_ind, Form_ses, Form_ind_ses,
                           Form_ind_class_fe, Form_ses_class_fe, 
                           Form_ind_ses_class_fe) 
  
  results <- lapply(list_specifications, function(x) (lm(x, data = data_robust, 
                                                         subset = (race < 4
                                                                   & condition))))
  results_r <- lapply(results, robust_se, "HC3")
  #  results_c <- lapply(results, cluster_se, "class_id")
  results_c <- NA
  
  results_final <- list(results, results_r, results_c)  
  return(results_final)
}

## Observing results for same race: 70/10 -----

result_same_race_bi_70_10 <- regs_cond_1(dep_var = "ssi_same_negro_na_2_std", 
                                         race_var = "negro_na", 
                                         grade_var = "grades_score",
                                         condition = (data_robust$flag_class_70_10 == 0))

result_all_race_bi_70_10 <- regs_cond_1(dep_var = "ssi_all_race_2_std", 
                                        race_var = "negro_na", 
                                        grade_var = "grades_score",
                                        condition = (data_robust$flag_class_70_10 == 0))

result_other_race_bi_70_10 <- regs_cond_1(dep_var = "ssi_other_race_negro_na_2_std", 
                                          race_var = "negro_na", 
                                          grade_var = "grades_score",
                                          condition = (data_robust$flag_class_70_10 == 0))

## Observing results for same race: 80/10 ---------- 
# Panel (a) - Table C9
result_same_race_bi_80_10 <- regs_cond_1(dep_var = "ssi_same_negro_na_2_std", 
                                         race_var = "negro_na", 
                                         grade_var = "grades_score",
                                         condition = (data_robust$flag_class_80_10 == 0))
result_all_race_bi_80_10 <- regs_cond_1(dep_var = "ssi_all_race_2_std", 
                                        race_var = "negro_na", 
                                        grade_var = "grades_score",
                                        condition = (data_robust$flag_class_80_10 == 0))
result_other_race_bi_80_10 <- regs_cond_1(dep_var = "ssi_other_race_negro_na_2_std", 
                                          race_var = "negro_na", 
                                          grade_var = "grades_score",
                                          condition = (data_robust$flag_class_80_10 == 0))


## Observing results for same race: 50/10
# Panel (b) - Table C9
result_same_race_bi_50_10 <- regs_cond_1(dep_var = "ssi_same_negro_na_2_std", 
                                         race_var = "negro_na", 
                                         grade_var = "grades_score",
                                         condition = (data_robust$flag_class_50_10 == 0))
result_all_race_bi_50_10 <- regs_cond_1(dep_var = "ssi_all_race_2_std", 
                                        race_var = "negro_na", 
                                        grade_var = "grades_score",
                                        condition = (data_robust$flag_class_50_10 == 0))
result_other_race_bi_50_10 <- regs_cond_1(dep_var = "ssi_other_race_negro_na_2_std", 
                                          race_var = "negro_na", 
                                          grade_var = "grades_score",
                                          condition = (data_robust$flag_class_50_10 == 0))

variables_to_omit <- "(age)|(male)|(religion)|(class_id)|(avg_)|(class)|(school)|(missing)|(EM)|
                      |(occ_)|(score_)|(poverty)|(Constant)|(supply)|(n_)|(occ_pai_)"

list_regs <- list(result_same_race_bi_50_10[[1]][[7]], 
                  result_same_race_bi_70_10[[1]][[7]], 
                  result_same_race_bi_80_10[[1]][[7]],
                  result_other_race_bi_50_10[[1]][[7]], 
                  result_other_race_bi_70_10[[1]][[7]], 
                  result_other_race_bi_80_10[[1]][[7]],
                  result_all_race_bi_50_10[[1]][[7]], 
                  result_all_race_bi_70_10[[1]][[7]], 
                  result_all_race_bi_80_10[[1]][[7]])

list_regs_c <- list(result_same_race_bi_50_10[[2]][[7]], 
                    result_same_race_bi_70_10[[2]][[7]], 
                    result_same_race_bi_80_10[[2]][[7]],
                    result_other_race_bi_50_10[[2]][[7]], 
                    result_other_race_bi_70_10[[2]][[7]], 
                    result_other_race_bi_80_10[[2]][[7]],
                    result_all_race_bi_50_10[[2]][[7]], 
                    result_all_race_bi_70_10[[2]][[7]], 
                    result_all_race_bi_80_10[[2]][[7]])

n_obs <- sapply(list_regs, get_obs)

r_adj <- sapply(list_regs, get_adj_r)

stargazer(list_regs_c, 
          column.separate = c(3,3,3),
          covariate.labels = ,
          dep.var.labels.include = F,
          dep.var.caption = "Social status",
          column.labels = c("Same race", "Other races", "All races"),
          omit = variables_to_omit, 
          omit.stat = c("f", "rsq","ser"),
          add.lines = list(c("Cutoff", "50\\%", "70\\%", "80\\%", "50\\%", "70\\%", "80\\%", "50\\%", "70\\%", "80\\%"),
                           c("Observation", n_obs),
                           c("Adjusted R2", r_adj)),
          column.sep.width = "0pt",
          font.size = "tiny", type = "text")


cov_var_labels <- c("Nonwhite", "Grades", "Nonwhite*Grades")

stargazer(list_regs_c, 
          column.separate = c(3,3,3),
          covariate.labels = cov_var_labels,
          dep.var.labels.include = F,
          dep.var.caption = "Social status",
          column.labels = c("Same race", "Other races", "All races"),
          omit = variables_to_omit, 
          omit.stat = c("f", "rsq","ser"),
          add.lines = list(c("Cutoff", "50\\%", "70\\%", "80\\%", "50\\%", "70\\%", "80\\%", "50\\%", "70\\%", "80\\%"),
                           c("Observation", n_obs),
                           c("Adjusted R2", r_adj)),
          column.sep.width = "0pt",
          font.size = "footnotesize", type = "latex", 
          out = "tables/robustness_sample_lin.tex",
          label = "robustness_sample_lin",
          title = "Robustness of results to difference samples - Linear specification")

